// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "glog/logging.h"

#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/complex_kernel.h"
#include "paddle/phi/kernels/elementwise_add_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h"
#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h"
#include "paddle/phi/kernels/slogdeterminant_grad_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"

namespace phi {

template <typename T, typename Context>
void SlogDeterminantGradKernel(const Context& dev_ctx,
                               const DenseTensor& x,
                               const DenseTensor& out,
                               const DenseTensor& out_grad,
                               DenseTensor* x_grad) {
  if (x_grad && x_grad->numel() == 0) {
    dev_ctx.template Alloc<T>(x_grad);
    return;
  }
  PADDLE_ENFORCE_EQ(
      out_grad.dims()[0],
      2,
      errors::InvalidArgument("The grad tensor of SlogDet should contain two"
                              " grad: sign and absslogdet, but here %ld.",
                              out_grad.dims()[0]));
  if (x.dims().size() > 2) {
    PADDLE_ENFORCE_EQ(
        out_grad.dims().size() + 1,
        x.dims().size(),
        errors::InvalidArgument(
            "The grad tensor of slogdet dims size should 1 less than"
            " input tensor's, but here differ %d",
            x.dims().size() - out_grad.dims().size()));
  }

  // Check Whether the matrix is invertible
  // (matrix A not invertible) == (absslogdet(A)=0)
  auto slogdet_vec = out.Split(1, 0);
  auto absslogdet_val = slogdet_vec[0];
  if (!detail::CheckMatrixInvertible<T, Context>(dev_ctx, &absslogdet_val)) {
    // The matrix is not invertible
    VLOG(3) << "The input matrix not invertible!";
    x_grad->Resize(x.dims());
    phi::Full<T>(dev_ctx,
                 common::vectorize(x.dims()),
                 std::numeric_limits<T>::quiet_NaN(),
                 x_grad);
    return;
  }

  // The matrix is invertible
  // let sl|A| = SlogDeterminant(A)
  // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
  // we set dsl|A| = unsqueeze(dslA, [-1, -2]) *
  // inverse(A).conj().transpose(-2, -1)

  // First: inverse(A)
  DenseTensor inverse_A;
  // A must be square matrices!
  inverse_A.Resize(x.dims());
  dev_ctx.template Alloc<T>(&inverse_A);

  const auto& mat_dims = x.dims();
  const int rank = mat_dims.size();
  int n = mat_dims[rank - 1];
  int64_t total_batch_size = rank > 2 ? x.numel() / (n * n) : 1;

  // Divide the batch into chunks because of cublasMatInv limitation
  if (total_batch_size <= 65536) {
    phi::funcs::MatrixInverseFunctor<Context, T> mat_inv;
    mat_inv(dev_ctx, x, &inverse_A);
  } else {
    constexpr int64_t max_batch_size = 65536;
    int64_t processed = 0;

    VLOG(3) << "Large batch size detected (" << total_batch_size
            << "), processing in chunks of " << max_batch_size;

    while (processed < total_batch_size) {
      int64_t current_batch =
          std::min(max_batch_size, total_batch_size - processed);

      // Extract current batch data
      DenseTensor x_batch;
      x_batch.ShareDataWith(x);
      x_batch.Resize({total_batch_size, n, n});
      x_batch = x_batch.Slice(processed, processed + current_batch);
      x_batch.Resize({current_batch, n, n});

      DenseTensor inverse_batch;
      inverse_batch.Resize({current_batch, n, n});
      dev_ctx.template Alloc<T>(&inverse_batch);

      // Compute the inverse matrix for the current batch
      phi::funcs::MatrixInverseFunctor<Context, T> mat_inv;
      mat_inv(dev_ctx, x_batch, &inverse_batch);

      // Copy the result to the output tensor
      DenseTensor output_slice;
      output_slice.ShareDataWith(inverse_A);
      output_slice.Resize({total_batch_size, n, n});
      output_slice = output_slice.Slice(processed, processed + current_batch);
      output_slice.Resize({current_batch, n, n});

      phi::Copy(
          dev_ctx, inverse_batch, dev_ctx.GetPlace(), false, &output_slice);

      processed += current_batch;
    }
  }

  VLOG(3) << "inverse(A) dims: " << inverse_A.dims();

  // Second: inverse(A).conj()
  auto conj_inverse_A = phi::Conj<T>(dev_ctx, inverse_A);

  VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims();

  // Third: inverse(A).conj().transpose(-2, -1)
  DenseTensor transpose_inverse_A =
      phi::TransposeLast2Dim<T>(dev_ctx, conj_inverse_A);
  VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: "
          << transpose_inverse_A.dims();

  // Fourth: split grad value to [sign_grad, absslogdet_grad]
  auto grad_vec = out_grad.Split(1, 0);
  auto det_grad = grad_vec[1];

  // remove useless first dimension
  int det_grad_size = det_grad.dims().size();
  std::vector<int64_t> det_grad_vec;
  for (int64_t i = 1; i < det_grad_size; ++i) {
    det_grad_vec.emplace_back(det_grad.dims()[i]);
  }
  det_grad.Resize(det_grad.dims().reshape(det_grad_vec));

  // Fifth: unsqueeze(dslA, [-1, -2])
  auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1);
  auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2);
  VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims();

  // Finally: unsqueeze(dslA) * inverse(A)
  auto res = phi::Multiply<T>(dev_ctx, unsqueeze2, transpose_inverse_A);
  VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims();

  phi::Copy(dev_ctx, res, dev_ctx.GetPlace(), false, x_grad);
  x_grad->Resize(x.dims());
  VLOG(3) << "dsl|A| dims: " << x_grad->dims();
}

template <typename T, typename Context>
void SlogDeterminantV2GradKernel(const Context& dev_ctx,
                                 const DenseTensor& x,
                                 const DenseTensor& sign,
                                 const DenseTensor& logdet,
                                 const DenseTensor& sign_grad UNUSED,
                                 const DenseTensor& logdet_grad,
                                 DenseTensor* x_grad) {
  using RealT = typename phi::dtype::Real<T>;
  const auto& x_dims = x.dims();
  const auto& grad_dims = logdet_grad.dims();
  int x_rank = x_dims.size();
  int grad_rank = grad_dims.size();

  PADDLE_ENFORCE_GE(
      x_rank,
      2,
      phi::errors::InvalidArgument(
          "Input tensor X's rank must be at least 2, but received %d.",
          x_rank));

  if (x_rank == 2)
    PADDLE_ENFORCE_EQ(
        grad_rank,
        0,
        phi::errors::InvalidArgument(
            "For a 2D input tensor X, the gradient tensor (logdet_grad) "
            "should be a 0D tensor (scalar), but received rank %d.",
            grad_rank));
  else if (x_rank > 2)
    PADDLE_ENFORCE_EQ(
        grad_rank + 2,
        x_rank,
        phi::errors::InvalidArgument(
            "The rank of gradient tensor (logdet_grad) should be 2 less than "
            "the input tensor X's rank, but received grad rank %d and X rank "
            "%d.",
            grad_rank,
            x_rank));

  dev_ctx.template Alloc<T>(x_grad);
  if (x_grad->numel() == 0) {
    return;
  }

  // Check Whether the matrix is invertible
  // (matrix A not invertible) == (absslogdet(A)=0)
  if (!detail::CheckMatrixInvertible<RealT, Context>(dev_ctx, &logdet)) {
    // The matrix is not invertible
    VLOG(3) << "The input matrix not invertible!";
    phi::Full<T>(dev_ctx,
                 common::vectorize(x.dims()),
                 std::numeric_limits<T>::quiet_NaN(),
                 x_grad);
    return;
  }

  // The matrix is invertible
  // let sl|A| = SlogDeterminant(A)
  // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
  // we set dsl|A| = unsqueeze(dslA, [-1, -2]) *
  // inverse(A).conj().transpose(-2, -1)

  // First: inverse(A)
  DenseTensor inverse_A;
  // A must be square matrices!
  inverse_A.Resize(x_dims);
  dev_ctx.template Alloc<T>(&inverse_A);

  phi::funcs::MatrixInverseFunctor<Context, T> mat_inv;
  mat_inv(dev_ctx, x, &inverse_A);

  VLOG(3) << "inverse(A) dims: " << inverse_A.dims();

  // Second: inverse(A).conj() for complex
  DenseTensor conj_inverse_A;
  if constexpr (is_complex64_or_complex128<T>::value) {
    conj_inverse_A = phi::Conj<T>(dev_ctx, inverse_A);
    VLOG(3) << "Performed complex conjugate.";
  } else {
    conj_inverse_A.ShareDataWith(inverse_A);
    VLOG(3) << "Skipped complex conjugate for real type.";
  }

  VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims();

  // Third: inverse(A).conj().transpose(-2, -1)
  DenseTensor transpose_inverse_A =
      phi::TransposeLast2Dim<T>(dev_ctx, conj_inverse_A);
  VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: "
          << transpose_inverse_A.dims();

  DenseTensor logdet_grad_term = logdet_grad;
  if constexpr (is_complex64_or_complex128<T>::value) {
    // change logdet_grad datatype from <RealT> to <ComplexT>
    DenseTensor logdet_grad_complex =
        Empty<T>(dev_ctx, common::vectorize(grad_dims));

    int64_t logdet_numel = logdet_grad.numel();
    phi::funcs::ForRange<Context> for_range(dev_ctx, logdet_numel);
    phi::funcs::RealToComplexFunctor<T> functor(
        logdet_grad.data<RealT>(), logdet_grad_complex.data<T>(), logdet_numel);

    for_range(functor);
    logdet_grad_term = logdet_grad_complex;
  }
  DenseTensor unsqueezed_combined_grad =
      phi::funcs::Unsqueeze(logdet_grad_term, -1);
  unsqueezed_combined_grad =
      phi::funcs::Unsqueeze(unsqueezed_combined_grad, -2);
  VLOG(3) << "unsqueezed_combined_grad dims: "
          << unsqueezed_combined_grad.dims();

  phi::Multiply<T, Context>(
      dev_ctx, unsqueezed_combined_grad, transpose_inverse_A, x_grad);
  VLOG(3) << x_grad->dims();
}

}  // namespace phi
